package au.com.acpfg.misc.uniprot;
import java.io.BufferedReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.knime.core.data.DataCell;
import org.knime.core.data.DataColumnSpec;
import org.knime.core.data.DataColumnSpecCreator;
import org.knime.core.data.DataRow;
import org.knime.core.data.DataTableSpec;
import org.knime.core.data.DataType;
import org.knime.core.data.container.DataContainer;
import org.knime.core.data.def.DefaultRow;
import org.knime.core.data.def.JoinedRow;
import org.knime.core.data.def.StringCell;
import org.knime.core.node.CanceledExecutionException;
import org.knime.core.node.ExecutionContext;
import org.knime.core.node.InvalidSettingsException;
public class AccessionMapTask implements UniProtTaskInterface {
private static final String base = "http://www.uniprot.org";
private String m_from_db, m_to_db;
private static boolean m_warn_again = true;
// XML is not available for this service, so no support is required
private static HashMap<String, String> m_db = new HashMap<String, String>();
static {
// only bi-directional maps are statically initialized, one-way maps are only available via get_db_list()
m_db.put("UniParc", "UPARC");
m_db.put("UniRef50", "NF50");
m_db.put("UniRef90", "NF90");
m_db.put("UniRef100", "NF100");
m_db.put("EMBL/GenBank/DDBJ", "EMBL_ID");
m_db.put("EMBL/GenBank/DDBJ CDS", "EMBL");
m_db.put("PIR", "PIR");
m_db.put("UniGene", "UNIGENE_ID");
m_db.put("Entrez Gene", "P_ENTREZ_GENEID");
m_db.put("GI number", "P_GI");
m_db.put("IPI", "P_IPI");
m_db.put("RefSeq", "P_REFSEQ_AC");
m_db.put("PDB", "PDB_ID");
m_db.put("DisProt", "DISPROT_ID");
m_db.put("HSSP", "HSSP_ID");
m_db.put("DIP", "DIP_ID");
m_db.put("MINT", "MINT_ID");
m_db.put("MEROPS", "MEROPS_ID");
m_db.put("PeroxiBase", "PEROXIBASE_ID");
m_db.put("PptaseDB", "PPTASEDB_ID");
m_db.put("REBASE", "REBASE_ID");
m_db.put("TCDB", "TCDB_ID");
m_db.put("Aarhus/Ghent-2DPAGE", "AARHUS_GHENT_2DPAGE_ID");
m_db.put("ECO2DBASE_ID", "ECO2DBASE_ID");
m_db.put("World-2DPAGE", "WORLD_2DPAGE_ID");
m_db.put("Ensembl", "ENSEMBL_ID");
m_db.put("Ensembl Protein", "ENSEMBL_PRO_ID");
m_db.put("Ensembl Transcript", "ENSEMBL_TRS_ID");
m_db.put("Ensemble Genomes", "ENSEMBLGENOME_ID");
m_db.put("Ensemble Genomes Protein", "ENSEMBLGENOME_PRO_ID");
m_db.put("Ensemble Genomes Transcript", "ENSEMBLEGENOME_TRS_ID");
m_db.put("GeneID", "P_ENTREZGENEID");
m_db.put("GenomeReviews", "GENOMEREVIEWS_ID");
m_db.put("KEGG", "KEGG_ID");
m_db.put("TIGR", "TIGR_ID");
m_db.put("UCSC", "UCSC_ID");
m_db.put("VectorBase", "VECTORBASE_ID");
m_db.put("AGD", "AGD_ID");
m_db.put("ArachnoServer", "ARACHNOSERVER_ID");
m_db.put("CGD", "CGD");
m_db.put("ConoServer", "CONOSERVER_ID");
m_db.put("CYGD", "CYGD_ID");
m_db.put("dictyBase", "DICTYBASE_ID");
m_db.put("EchoBASE", "ECHOBASE_ID");
m_db.put("EcoGene", "ECOGENE_ID");
m_db.put("euHCVdb", "EUHCVDB_ID");
m_db.put("EuPathDB", "EUPATHDB_ID");
m_db.put("FlyBase", "FLYBASE_ID");
m_db.put("GeneCards", "GENECARDS_ID");
m_db.put("GeneDB_Spombe", "GENEDB_SPOMBE_ID");
m_db.put("GeneFarm", "GENEFARM_ID");
m_db.put("GenoList", "GENOLIST_ID");
m_db.put("H-InvDB", "H_INVDB_ID");
m_db.put("HGNC", "HGNC_ID");
m_db.put("HPA", "HPA_ID");
m_db.put("LegioList", "LEGIOLIST_ID");
m_db.put("Leproma", "LEPROMA_ID");
m_db.put("MaizeGDB", "MAIZEGDB_ID");
m_db.put("MIM", "MIM_ID");
m_db.put("MGI", "MGI_ID");
m_db.put("NMPDR", "NMPDR_ID");
m_db.put("Orphanet", "ORPHANET_ID");
m_db.put("PharmGKB", "PHARMGKB_ID");
m_db.put("PseudoCAP", "PSEUDOCAP_ID");
m_db.put("RGD", "RGD_ID");
m_db.put("SGD", "SGD_ID");
m_db.put("TAIR", "TAIR_ID");
m_db.put("TubercuList", "TUBERCULIST_ID");
m_db.put("WormBase", "WORMBASE_ID");
m_db.put("WormBase Transcript", "WORMBASE_TRS_ID");
m_db.put("WormBase Protein", "WORMBASE_PRO_ID");
m_db.put("Xenbase", "XENBASE_ID");
m_db.put("ZFIN", "ZFIN_ID");
m_db.put("eggNOG", "EGGNOG_ID");
m_db.put("HOGENOM", "HOGENOM_ID");
m_db.put("HOVERGEN", "HOVERGEN_ID");
m_db.put("OMA", "OMA_ID");
m_db.put("OrthoDB", "ORTHODB_ID");
m_db.put("ProtClustDB", "PROTCLUSTDB_ID");
m_db.put("BioCyc", "BIOCYC_ID");
m_db.put("Reactome", "REACTOME_ID");
m_db.put("CleanEx", "CLEANEX_ID");
m_db.put("GermOnline", "GERMONLINE_ID");
m_db.put("DrugBank", "DRUGBANK_ID");
m_db.put("NextBio", "NEXTBIO_ID");
};
public AccessionMapTask(String from_db, String to_db) throws Exception {
if (!from_db.equals("UniProtKB AC/ID") && !m_db.containsKey(from_db) ) {
throw new InvalidSettingsException("From database is not valid: "+from_db);
} else {
m_from_db = m_db.get(from_db);
if (m_from_db == null) { // uniprot one-way database?
if (from_db.equals("UniProtKB AC/ID")) {
m_from_db = "ACC+ID";
} else {
throw new InvalidSettingsException("From database is not valid: "+from_db);
}
}
}
if (!to_db.startsWith("UniProt") && !m_db.containsKey(to_db)) {
throw new InvalidSettingsException("To database is not valid: "+to_db);
} else {
m_to_db = m_db.get(to_db);
if (m_to_db == null) {
if (to_db.equals("UniProtKB AC")) {
m_to_db = "ACC";
} else if (to_db.equals("UniProtKB ID")) {
m_to_db = "ID";
} else {
throw new InvalidSettingsException("To database is not valid: "+to_db);
}
}
}
}
public static List<String> get_db_list(boolean from_db) {
ArrayList<String> l = new ArrayList<String>();
l.addAll(m_db.keySet());
if (from_db) {
l.add("UniProtKB AC/ID");
} else {
l.add("UniProtKB AC");
l.add("UniProtKB ID");
}
Collections.sort(l);
return l;
}
private String execute(String tool, NameValuePair[] params) throws Exception {
HttpClient client = new HttpClient();
String location = base + '/' + tool + '/';
try {
HttpMethod method = new PostMethod(location);
((PostMethod) method).addParameters(params);
method.setFollowRedirects(false);
int status = client.executeMethod(method);
//Logger.getAnonymousLogger().info(HttpStatus.getStatusText(status));
if (status == HttpStatus.SC_MOVED_TEMPORARILY) {
location = method.getResponseHeader("Location").getValue();
method.releaseConnection();
method = new GetMethod(location);
status = client.executeMethod(method);
}
while (true)
{
int wait = 0;
Header header = method.getResponseHeader("Retry-After");
if (header != null)
wait = Integer.valueOf(header.getValue());
if (wait == 0)
break;
Thread.sleep(wait * 1000);
method.releaseConnection();
method = new GetMethod(location);
status = client.executeMethod(method);
}
if (status == HttpStatus.SC_OK) {
String ret = method.getResponseBodyAsString();
method.releaseConnection();
return ret;
}
} catch (Exception e) {
e.printStackTrace();
throw e;
}
return null;
}
@Override
public DataTableSpec getTableSpec(boolean want_xml) {
DataColumnSpec[] cols = new DataColumnSpec[2];
cols[1] = new DataColumnSpecCreator("UniProt: Output Accession ("+m_to_db+")", StringCell.TYPE).createSpec();
cols[0] = new DataColumnSpecCreator("UniProt: Input Accession ("+m_from_db+")", StringCell.TYPE).createSpec();
return new DataTableSpec(cols);
}
@Override
public int run(String[] accsns, DataRow[] in_rows, DataContainer out) throws Exception {
int batch_size = 20;
HashMap<String,String> map = new HashMap<String,String>();
for (int i=0; i<accsns.length; i += batch_size) {
String query_str = "";
for (int j=0; j<batch_size; j++) {
if (i+j < accsns.length) {
query_str += accsns[i+j] + " ";
}
};
String txt = execute("mapping", new NameValuePair[] {
new NameValuePair("from", m_from_db),
new NameValuePair("to", m_to_db),
new NameValuePair("format", "tab"),
new NameValuePair("query", query_str) }
);
// Logger.getAnonymousLogger().info(txt);
BufferedReader sr = new BufferedReader(new StringReader(txt));
String line;
boolean first = true;
while ((line = sr.readLine()) != null) {
String[] tokens = line.split("\\s+");
if (!first && tokens.length == 2) {
map.put(tokens[0], tokens[1]);
}
first = false;
}
}
int n_hits = 0;
if (in_rows != null) {
int idx = 0;
for (DataRow in : in_rows) {
DataCell[] cells = new StringCell[2];
boolean has_map = map.containsKey(accsns[idx]);
if (has_map) {
cells[0] = new StringCell(accsns[idx]);
cells[1] = new StringCell(map.get(accsns[idx]));
n_hits++;
} else {
cells[0] = DataType.getMissingCell();
cells[1] = DataType.getMissingCell();
}
DataRow r = new DefaultRow(in_rows[idx].getKey(), cells); // cells must match tablespec
out.addRowToTable(new JoinedRow(in_rows[idx++], r));
n_hits++;
}
} else {
int hit = 1;
for (String key : map.keySet()) {
DataCell[] cells = new StringCell[2];
cells[0] = new StringCell(key);
cells[1] = new StringCell(map.get(key));
out.addRowToTable(new DefaultRow("Hit"+hit++, cells));
n_hits++;
}
}
return n_hits;
}
@Override
public String fix_accsn(String in_accsn) throws Exception {
return in_accsn.trim();
}
@Override
public void cleanup() throws Exception {
// NO-OP
}
@Override
public void pause(ExecutionContext exec, double progress, String msg)
throws InterruptedException, CanceledExecutionException {
// TODO: no cache for this task, so mandatory pause for now....
exec.checkCanceled();
exec.setProgress(progress, "Pause to be nice to UniProt servers (20sec. delay)");
Thread.sleep(20 * 1000);
}
}